library(ggplot2)
library(showtext)## Add fonts that are available on Windows
# Turn on showtext rendering for graphics devices so the font registered
# below can be drawn in the saved figure. `showtext_auto()` is the current
# name of the deprecated `showtext.auto()`, matching the underscore naming
# already used by `font_add()`.
showtext_auto(enable = TRUE)
# Register the font file "simhei.ttf" under the family name "heiti".
font_add("heiti", "simhei.ttf")

# Load the space-separated, header-less keyword coverage table and name its
# four columns. Argument names are written out in full (`stringsAsFactors`
# instead of the partially matched `stringsAsFactor`) and logicals use
# `FALSE` rather than the reassignable shorthand `F`.
# NOTE(review): the variable name `c` masks base::c when looked up as a
# variable; it is kept because later statements in this script refer to the
# data frame by this name.
c <- read.csv(
  "keywordCoverageSet_keyword_prov.txt",
  stringsAsFactors = FALSE,
  header = FALSE,
  sep = " "
)
names(c) <- c("num", "keyword", "province", "prop")
## rank provinces by their area under the curve
library(DescTools)
library(dplyr)


#' Area under a curve via the trapezoid rule
#'
#' Thin wrapper around `AUC()` that fixes the integration method to
#' "trapezoid" and leaves missing values in place (`na.rm = FALSE`).
#'
#' @param x Values passed to `AUC()` as its `x` argument.
#' @param y Values passed to `AUC()` as its `y` argument.
#' @return Whatever `AUC()` returns for the given inputs.
auc_own <- function(x, y) {
  AUC(x, y, method = "trapezoid", na.rm = FALSE)
}

# Compute one area-under-the-curve value per province. Within each province
# `num` is first divided by that province's row count (`n()`), and the
# rescaled values are integrated against `prop` with `auc_own()`.
auc_prov <- as.data.frame(
  ungroup(
    summarise(
      mutate(group_by(c, province), num_prop = num / n()),
      auc = auc_own(num_prop, prop)
    )
  )
)

# Sort provinces from largest to smallest AUC.
auc_prov <- auc_prov[order(auc_prov$auc, decreasing = TRUE), ]

# Make `province` a factor whose level order follows the AUC ranking, so
# anything keyed on the factor (e.g. a plot legend) uses that order.
c$province <- factor(c$province, levels = auc_prov$province)


# Plot keyword coverage (`prop`) against dictionary size (`num`), one
# grey-scale line per province, then save the figure as a PDF.
ggplot(c, aes(x = num, y = prop, color = province)) +
  # Group by province so each province gets its own line. The previous
  # `aes(province)` bound province to `x` positionally, overriding `x = num`.
  geom_line(aes(group = province), size = 0.05) +
  geom_point(size = 0.01) +
  # Reference line at a dictionary size of 50.
  geom_vline(xintercept = 50, linetype = "dotdash") +
  scale_y_continuous(breaks = seq(0, 1, 0.05), limits = c(0, 1)) +
  # `num` is numeric, so it needs a continuous scale rather than a discrete
  # one for the breaks and limits to apply.
  # NOTE(review): a break every 5 units gives 201 tick labels over 0-1000;
  # widen the step if the labels overlap.
  scale_x_continuous(breaks = seq(0, 1000, 5), limits = c(0, 1000)) +
  scale_color_grey() +
  xlab("Size of Keyword Dictionary K") +
  ylab("Proportion of Wickedonna posts that contains words in K") +
  # The complete theme must come first: adding `theme_bw()` after `theme()`
  # replaces every customisation made before it.
  theme_bw() +
  theme(
    # Use the family registered earlier with font_add("heiti", ...); the
    # previous value 'SimSun' was never registered with showtext.
    text = element_text(family = "heiti"),
    axis.text.y = element_text(size = 16, colour = "black"),
    axis.title.y = element_text(size = 14, colour = "black"),
    axis.text.x = element_text(size = 16, colour = "black"),
    axis.title.x = element_text(size = 15, colour = "black")
  )

# With no `plot` argument, ggsave() writes the most recently created plot.
ggsave(
  "keyword_coverage_wickedonna_by_province.pdf",
  device = "pdf",
  width = 12,
  height = 8,
  units = "in"
)


